# -*- coding: utf-8 -*-
#######################################################
### Python Scraping files for: Pick Your Language   ###
### Published in the Asian Journal of Communication ###
### William O'Brochta                               ###
### Washington University in St. Louis              ###
#######################################################

#Set system encoding to read UTF 8
#Python 2 only: reload() is a builtin there. Reloading sys makes
#sys.setdefaultencoding callable again, so these two lines must stay in this order.
import sys
reload(sys)
#Use UTF-8 for implicit str/unicode conversions (the search term below is non-ASCII)
sys.setdefaultencoding('utf8')

#Import Selenium
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import csv
import pandas as pd
import time
import random

#Set OS directory 'PATHTODIRECTORY' and Google application credentials 'PATHTOKEY'
#os is needed for chdir and environ below
import os
os.chdir('/Users/PATHTODIRECTORY')
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/PATHTOKEY.json"

#Create the translation client after the credentials variable has been set
#NOTE(review): newer google-cloud-translate releases appear to expose this
#client as translate_v2.Client -- confirm against the installed version
from google.cloud import translate
translate_client = translate.Client()


# Module-level lists that the scraping functions append their results to
all_titles, all_dates, all_text, all_links = [], [], [], []

#Run collection with Chrome
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 30)
driver.get('https://www.livehindustan.com/search/दंगा/1')

#Find titles of articles: look up the first title link on the first results page
#XPath used: //*[@id="search-khabre"]/div/ul/li[1]/div/div/h4/a
single_title = driver.find_elements(By.XPATH, '//*[@id="search-khabre"]/div/ul/li[1]/div/div/h4/a')[0]

#Helper: read the article links shown on the results page currently loaded
def _page_links(links_per_page):
    """Return the article URLs found on the page the driver is showing.

    Result slots 1..links_per_page are queried in order. Reading stops at
    the first slot without a title link, so the returned list can be
    shorter than links_per_page (or empty).
    """
    found = []
    for index in range(1, links_per_page + 1):
        matches = driver.find_elements(
            By.XPATH,
            '//*[@id="search-khabre"]/div/ul/li[' + str(index) + ']/div/div/h4/a')
        if not matches:
            break
        found.append(matches[0].get_attribute('href'))
    return found


#Collect data function to collect each link
def collect_data(first_page=1, last_page=116, links_per_page=25):
    """Append the article links from the search-result pages to all_links.

    Each page from first_page to last_page (inclusive) is opened in the
    module-level driver and the href of every title link is read. Pages are
    loaded explicitly, so the result does not depend on what the browser
    was showing before the call.

    If a page yields fewer than links_per_page links it is reloaded once
    after a 5 second pause and read again; whichever attempt found more
    links is kept. A page's links are added to all_links only once, after
    both attempts, so a retry cannot add duplicates and a short page does
    not raise IndexError.

    Args:
        first_page: number of the first results page to read.
        last_page: number of the last results page to read (inclusive).
        links_per_page: how many result slots to look for on each page.

    Returns:
        None. Links are appended to the module-level list all_links.
    """
    for page_number in range(first_page, last_page + 1):
        driver.get('https://www.livehindustan.com/search/दंगा/' + str(page_number))
        time.sleep(random.uniform(3, 6))

        page_links = _page_links(links_per_page)
        if len(page_links) < links_per_page:
            #Fewer links than expected: wait, reload once and read again
            time.sleep(5)
            driver.refresh()
            retry_links = _page_links(links_per_page)
            if len(retry_links) > len(page_links):
                page_links = retry_links

        all_links.extend(page_links)
        time.sleep(random.uniform(1, 5))

#Collect the links from every results page
collect_data()

#Save the links; written as UTF-8 explicitly, like the text export at the end of the file
df = pd.DataFrame({"Link": all_links})
df.to_csv("data_links.csv", encoding='utf-8')


#Visit all the links
#Close the browser used for link collection, if there is one, before opening a new one
try:
    driver.quit()
except NameError:
    pass
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 30)

#Import data frame with links
#NOTE(review): this reads 'data_links_Hindustan.csv' without a header row, while
#the link collection step writes 'data_links.csv' with an index column and a
#header -- presumably the file is renamed/cleaned by hand in between; confirm
df2 = pd.read_csv('data_links_Hindustan.csv', header=None, names=['Links'])
data_links = df2['Links'].values.tolist()

#Set-up translation: translate the search term once and show the result
translation = translate_client.translate('दंगा', 'en')
print(translation['translatedText'])

#Helper: translate one piece of text to English
def _translate_to_english(text):
    """Return text translated to English by the module-level translate_client."""
    return translate_client.translate(text, 'en')['translatedText']


#Helper: load one article and read its title, date and body
def _scrape_article(link):
    """Open link in the module-level driver and return (title, date, text).

    The title and the body text are translated to English; the date is
    returned as it appears on the page. Any error from the browser lookups
    or the translation call is passed on to the caller, e.g. IndexError
    when the title or date element is not found.
    """
    driver.get(link)
    #Read everything from the page first so a missing element fails before
    #any translation request is made
    title = driver.find_elements(By.XPATH, '//*[@id="mainContent"]/div[2]/h1')[0].text
    date = driver.find_elements(By.XPATH, '//*[@id="leftCol"]/div[1]/div/div/div[2]/ul/li/a')[0].text
    body = driver.find_element(By.CLASS_NAME, 'main-story2').text
    return _translate_to_english(title), date, _translate_to_english(body)


#Run function to visit each link and get title and text translated
def get_text(link_list):
    """Scrape every article in link_list and store the translated results.

    For each link the article is loaded, its title and body are translated
    to English and, together with the date and the link itself, appended
    to the module-level lists all_titles, all_text, all_dates and
    all_links. The four lists are only extended after an article has been
    read completely, so they grow in step.

    A failed attempt is retried once after a 5 second pause, loading the
    link again (not just refreshing, so the retry cannot read a previously
    loaded page). If the retry fails as well, its exception propagates.

    After each article a random 5-8 second pause is taken and the number
    of links processed so far is printed.

    Args:
        link_list: iterable of article URLs.

    Returns:
        None.
    """
    for link_number, link in enumerate(link_list, 1):
        try:
            title, date, text = _scrape_article(link)
        except Exception:
            #Exception (not a bare except) so Ctrl-C still stops the run
            time.sleep(5)
            title, date, text = _scrape_article(link)

        all_text.append(text)
        all_links.append(link)
        all_titles.append(title)
        all_dates.append(date)
        time.sleep(random.uniform(5, 8))
        print(link_number)

#Start from empty result lists so every column gets exactly one entry per
#visited link (all_links may still hold the links gathered by collect_data)
del all_titles[:], all_dates[:], all_text[:], all_links[:]
try:
    get_text(data_links[0:2878])
finally:
    #Close the browser whether or not the scraping finished
    driver.quit()

#Data frame with text, links, titles, and dates
df3 = pd.DataFrame({"Text": all_text, "Link":all_links, "Title":all_titles, "Date":all_dates})
df3.to_csv("data_links_text.csv", encoding='utf-8')

